import requests;
from lxml import etree;

# Sample HTML document used as the fixture for the XPath queries below.
# It contains one <ul> holding six <li> elements (five of which wrap an <a>
# link) plus a bare text node directly inside the <ul>.
html = """
<html>
	<div>
		<ul>
			<li class="item-0">
				<a href="link1.html">first item</a>
			</li>
			<li class="item-1">
				<a href="link2.html">second item</a>
			</li>	
			<li class="item-active">
				<a href="link3.html">third item</a>
			</li>	
			<li class="item-1">
				<a href="link4.html">fourth item</a>
			</li>			
			<li class="item-0">
				<a href="link5.html">fifth item</a>
			</li>
			<li class="else-1">something else</li>		
			this is ul item
		</ul>				
	</div>
</html>
"""

# Parse the sample markup into an lxml element tree that XPath can query.
selector = etree.HTML(html)

# Select by class attribute: text of the <a> inside the "item-active" <li>.
# That class value occurs only once in the sample, so taking [0] is safe here.
a3_text = selector.xpath('//li[@class="item-active"]/a/text()')[0]
print(a3_text)

# Select by href attribute; "*" matches an element with any tag name.
# The href value occurs only once in the sample, so taking [0] is safe here.
a1_text = selector.xpath('//*[@href="link1.html"]/text()')[0]
print(a1_text)

# Read an attribute value: the href of the <a> inside the third <li>.
a3_href = selector.xpath('//li[3]/a/@href')[0]
print(a3_href)

# Collect the class attribute value of every <li> element as a list.
all_class = selector.xpath('//li/@class')
print(all_class)